Merge pull request #866 from cantino/website_agent-url_on_receive

Add a url_from_event option to WebsiteAgent

Akinori MUSHA 9 years ago
parent
commit
caa2132b99
3 changed files with 21 additions and 3 deletions
  1. 1 0
      CHANGES.md
  2. 9 3
      app/models/agents/website_agent.rb
  3. 11 0
      spec/models/agents/website_agent_spec.rb

+ 1 - 0
CHANGES.md

@@ -1,5 +1,6 @@
1 1
 # Changes
2 2
 
3
+* Jun 19, 2015   - Add `url_from_event` to WebsiteAgent.
3 4
 * Jun 17, 2015   - RssAgent emits events for new feed items in chronological order.
4 5
 * Jun 15, 2015   - Liquid filter `uri_expand` added.
5 6
 * Jun 12, 2015   - RSSAgent can now accept an array of URLs.

+ 9 - 3
app/models/agents/website_agent.rb

@@ -19,7 +19,7 @@ module Agents
19 19
 
20 20
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
21 21
 
22
-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
22
+      The WebsiteAgent can also scrape based on incoming events. By default it will scrape the url contained in the `url` key of the incoming event payload; if you set `url_from_event`, that option is instead used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
23 23
 
24 24
       # Supported Document Types
25 25
 
@@ -135,7 +135,8 @@ module Agents
135 135
 
136 136
     def validate_options
137 137
       # Check for required fields
138
-      errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
138
+      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
139
+      errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
139 140
       if !options['extract'].present? && extraction_type != "json"
140 141
         errors.add(:base, "extract is required for all types except json")
141 142
       end
@@ -257,7 +258,12 @@ module Agents
257 258
     def receive(incoming_events)
258 259
       incoming_events.each do |event|
259 260
         interpolate_with(event) do
260
-          url_to_scrape = event.payload['url']
261
+          url_to_scrape =
262
+            if url_template = options['url_from_event'].presence
263
+              interpolate_string(url_template)
264
+            else
265
+              event.payload['url']
266
+            end
261 267
           check_url(url_to_scrape,
262 268
                     interpolated['mode'].to_s == "merge" ? event.payload : {})
263 269
         end

+ 11 - 0
spec/models/agents/website_agent_spec.rb

@@ -633,6 +633,17 @@ fire: hot
633 633
         }.to change { Event.count }.by(1)
634 634
       end
635 635
 
636
+      it "should use url_from_event as url to scrape if it exists when receiving an event" do
637
+        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
638
+
639
+        @checker.options = @valid_options.merge(
640
+          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
641
+        )
642
+        @checker.receive([@event])
643
+
644
+        expect(stub).to have_been_requested
645
+      end
646
+
636 647
       it "should interpolate values from incoming event payload" do
637 648
         expect {
638 649
           @valid_options['extract'] = {